Loading libraries
library(dplyr)
library(ggplot2)
library(plotly)
library(tidyr)
Reading data
# Load the ligand summary table. read.csv2() parses ";"-separated files that
# use "," as the decimal mark; only the first 10000 records are read.
data <- read.csv2(
  file = "./all_summary.csv",
  nrows = 10000
)
# Show the (rows, columns) shape of the loaded table.
dim(data)
## [1] 10000 412
Deleting chosen ligands
# Residue names whose rows are removed from `data` before any analysis.
# NOTE(review): "H20" is spelled with a digit zero; it may be meant as "H2O" --
# confirm against the dataset specification before changing it.
deletable_res_name <- c(
  "UNK", "UNX", "UNL", "DUM", "N", "BLOB",
  "ALA", "ARG", "ASN", "ASP", "CYS", "GLN", "GLU", "GLY", "HIS", "ILE",
  "LEU", "LYS", "MET", "MSE", "PHE", "PRO", "SEC", "SER", "THR", "TRP",
  "TYR", "VAL",
  "DA", "DG", "DT", "DC", "DU",
  "A", "G", "T", "C", "U",
  "HOH", "H20", "WAT"
)
# %in% never yields NA, so rows with a missing res_name are kept.
is_deletable <- data$res_name %in% deletable_res_name
data <- data[!is_deletable, ]
# Shape of the table after the removal.
dim(data)
## [1] 9940 412
Processing missing data
# Removal of incomplete rows is currently disabled: both statements below are
# commented out, so rows containing NA values remain in `data`.
# NOTE(review): presumably disabled because complete.cases() over every column
# would discard too many rows -- confirm before re-enabling.
#data <- data[complete.cases(data), ]
#dim(data)
Data summary
# The knitr summary table is commented out, so this section only reports the
# current (rows, columns) shape of `data`.
#knitr::kable(summary(data))
dim(data)
## [1] 9940 412
50 most popular ligands
# Count how many rows each res_name has, then keep the 50 most frequent names
# (fewer if the data holds fewer distinct names).
ligand_counts <- summarise(group_by(data, res_name), cardinality = n())
popular_names <- ligand_counts %>%
  arrange(desc(cardinality)) %>%
  slice(seq_len(50))

# Plain vector of the selected res_name values.
popular_names_list <- popular_names[["res_name"]]

# Restrict the data set to rows that belong to the popular ligands.
is_popular <- data$res_name %in% popular_names_list
data <- data[is_popular, ]
dim(data)
## [1] 6789 412
Cardinality of ligands by name
# Bar chart of row counts per ligand, bars ordered from most to least frequent
# and shaded by their count; x-axis labels are rotated to stay readable.
plot <- ggplot(
  data = popular_names,
  mapping = aes(
    x = reorder(res_name, -cardinality),
    y = cardinality,
    fill = cardinality
  )
) +
  geom_bar(stat = "identity") +
  labs(x = "ligand", title = "Cardinality of ligands by name") +
  theme(axis.text.x = element_text(angle = 90))

# Convert the static ggplot into an interactive plotly widget.
ggplotly(plot)
Distribution of atom and electron count
# Kernel density of the local_res_atom_non_h_count column, drawn as a filled,
# outline-free area and rendered interactively.
plot_atom <- ggplot(
  data = data,
  mapping = aes(x = local_res_atom_non_h_count)
) +
  geom_density(fill = "#00CECB", color = NA, alpha = 0.3) +
  labs(x = "atom count", title = "Atom count distribution")

ggplotly(plot_atom)
# Kernel density of the local_res_atom_non_h_electron_sum column, drawn as a
# filled, outline-free area and rendered interactively.
plot_electron <- ggplot(
  data = data,
  mapping = aes(x = local_res_atom_non_h_electron_sum)
) +
  geom_density(fill = "#FF5E5B", color = NA, alpha = 0.3) +
  labs(x = "electron count", title = "Electron count distribution")

ggplotly(plot_electron)